# Scrapy settings for sitemap_crawler project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings by consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# Name of the bot implemented by this Scrapy project.
BOT_NAME = "sitemap_crawler"

# Package(s) Scrapy searches for spider classes.
SPIDER_MODULES = [
    "sitemap_crawler.spiders",
]
# Package in which newly generated spiders are created.
NEWSPIDER_MODULE = "sitemap_crawler.spiders"

# Extra request headers; currently empty.
# NOTE(review): Scrapy's built-in setting is DEFAULT_REQUEST_HEADERS, so this
# name is presumably read by project code rather than by Scrapy itself -
# confirm before renaming or removing.
DEFAULT_HEADERS = {}

# Desktop Chrome 84 (Windows 10, x64) User-Agent string sent with requests.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/84.0.4147.135 Safari/537.36"
)

# robots.txt rules are NOT honoured by this crawler.
ROBOTSTXT_OBEY = False
# JSON sitemap definition (stored as a string) with id "huzheng".
# Starts at a takungpao.com news index page and declares three selectors:
#   - "title":   SelectorLink, multiple, matched by "dd>a[title]"
#   - "next":    SelectorLink, single, matched by "a.cms_nextpage"; it lists
#                itself as a parent alongside "_root"
#                (presumably so pagination is followed repeatedly - confirm)
#   - "content": SelectorText under "title", matched by "p:nth-of-type(n+2)"
# NOTE(review): the layout looks like a Web Scraper (webscraper.io) sitemap
# export, and this appears to be a project-specific setting rather than a
# built-in Scrapy one - confirm which component parses it.
SITEMAP = """{
    "_id": "huzheng",
    "startUrl": [
        "http://www.takungpao.com/news/232109/index.html"
    ],
    "selectors": [
        {
            "id": "title",
            "type": "SelectorLink",
            "parentSelectors": [
                "_root",
                "next"
            ],
            "selector": "dd>a[title]",
            "multiple": true,
            "delay": 0
        },
        {
            "id": "next",
            "type": "SelectorLink",
            "parentSelectors": [
                "_root",
                "next"
            ],
            "selector": "a.cms_nextpage",
            "multiple": false,
            "delay": 0
        },
        {
            "id": "content",
            "type": "SelectorText",
            "parentSelectors": [
                "title"
            ],
            "selector": "p:nth-of-type(n+2)",
            "multiple": false,
            "regex": "",
            "delay": 0
        }
    ]
}"""

# JSON sitemap definition (stored as a string) with id "douban".
# Starts at a single movie.douban.com subject page; every selector hangs
# directly off "_root":
#   - SelectorText:  "title", "year", "director", "score", "runtime"
#   - SelectorGroup: "movie_type" (matches the v:genre spans)
# The string begins and ends with a newline around the JSON payload.
DOUBAN_SITEMAP = """
{"_id":"douban","startUrl":["https://movie.douban.com/subject/34902639/"],"selectors":[{"id":"title","type":"SelectorText","parentSelectors":["_root"],"selector":"h1 span[property]","multiple":false,"regex":"","delay":0},{"id":"year","type":"SelectorText","parentSelectors":["_root"],"selector":"span.year","multiple":false,"regex":"","delay":0},{"id":"director","type":"SelectorText","parentSelectors":["_root"],"selector":"span:nth-of-type(1) .attrs a","multiple":false,"regex":"","delay":0},{"id":"score","type":"SelectorText","parentSelectors":["_root"],"selector":"strong","multiple":false,"regex":"","delay":0},{"id":"runtime","type":"SelectorText","parentSelectors":["_root"],"selector":"span[property='v:runtime']","multiple":false,"regex":"","delay":0},{"id":"movie_type","type":"SelectorGroup","parentSelectors":["_root"],"selector":"span[property='v:genre']:nth-of-type(n+5)","delay":0,"extractAttribute":""}]}
"""

# JSON sitemap definition (stored as a string) with id "tencent_news".
# Starts at https://news.qq.com/ and declares:
#   - "news_link": SelectorLink, multiple, matched by "h3 a" under "_root"
#   - "title":     SelectorText ("h1") under "news_link"
#   - "content":   SelectorText ("div.content-article") under "news_link"
#   - "images":    SelectorGroup ("p img") under "news_link", extracting the
#                  "src" attribute
# The string begins and ends with a newline around the JSON payload.
TENCENT_SITEMAP = """
{"_id":"tencent_news","startUrl":["https://news.qq.com/"],"selectors":[{"id":"news_link","type":"SelectorLink","parentSelectors":["_root"],"selector":"h3 a","multiple":true,"delay":0},{"id":"title","type":"SelectorText","parentSelectors":["news_link"],"selector":"h1","multiple":false,"regex":"","delay":0},{"id":"content","type":"SelectorText","parentSelectors":["news_link"],"selector":"div.content-article","multiple":false,"regex":"","delay":0},{"id":"images","type":"SelectorGroup","parentSelectors":["news_link"],"selector":"p img","delay":0,"extractAttribute":"src"}]}
"""

# JSON sitemap definition (stored as a string) with id "dagong_news".
# Starts at the same takungpao.com news index page as SITEMAP and declares:
#   - "news_url":   SelectorLink, multiple, under "_root" and "next_page"
#   - "next_page":  SelectorLink ("a.cms_nextpage"), single; lists itself as
#                   a parent alongside "_root"
#                   (presumably so pagination is followed repeatedly - confirm)
#   - "news_title", "author", "date": SelectorText under "news_url"
#   - "content":    SelectorGroup ("p:nth-of-type(n+2)") under "news_url"
# The string begins and ends with a newline around the JSON payload.
DAGONG_SITEMAP = """
{"_id":"dagong_news","startUrl":["http://www.takungpao.com/news/232109/index.html"],"selectors":[{"id":"news_url","type":"SelectorLink","parentSelectors":["_root","next_page"],"selector":".item.clearfix dd:nth-of-type(1) a","multiple":true,"delay":0},{"id":"next_page","type":"SelectorLink","parentSelectors":["_root","next_page"],"selector":"a.cms_nextpage","multiple":false,"delay":0},{"id":"news_title","type":"SelectorText","parentSelectors":["news_url"],"selector":"h2.tkp_con_title","multiple":false,"regex":"","delay":0},{"id":"author","type":"SelectorText","parentSelectors":["news_url"],"selector":".tkp_con_author span:nth-of-type(2)","multiple":false,"regex":"","delay":0},{"id":"date","type":"SelectorText","parentSelectors":["news_url"],"selector":".tkp_con_author span:nth-of-type(1)","multiple":false,"regex":"","delay":0},{"id":"content","type":"SelectorGroup","parentSelectors":["news_url"],"selector":"p:nth-of-type(n+2)","delay":0,"extractAttribute":""}]}
"""


# Encoding for exported feeds. Per the Scrapy feed-export docs, setting this
# to UTF-8 explicitly keeps non-ASCII text readable in JSON output instead of
# being written as \uXXXX escapes.
FEED_EXPORT_ENCODING = "utf-8"